##loading the data and libraries
library(tidyverse)
library(janitor)
# Read the neighbourhood-rating CSV and pass it through janitor::clean_names()
# (later code refers to the columns by snake_case names such as date_code).
neighbourhood_rating <- clean_names(
  read_csv("../raw_data/neighbourhood_rating.csv")
)
Rows: 38055 Columns: 13── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): FeatureCode, Measurement, Units, Neighbourhood rating, Gender, Urban Rural Classification, SIMD quintiles, Typ...
dbl (2): DateCode, Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the green/blue-space distance CSV and clean its column names.
green_spaces <- clean_names(
  read_csv("../raw_data/green_spaces.csv")
)
Rows: 38451 Columns: 13── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): FeatureCode, Measurement, Units, Distance to Nearest Green or Blue Space, Age, Gender, Urban Rural Classificat...
dbl (2): DateCode, Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the community-belonging CSV and clean its column names.
community_belonging <- clean_names(
  read_csv("../raw_data/community_belonging.csv")
)
Rows: 43611 Columns: 13── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): FeatureCode, Measurement, Units, Community belonging, Gender, Urban Rural Classification, SIMD quintiles, Type...
dbl (2): DateCode, Value
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print each table for a first look at its contents.
neighbourhood_rating
green_spaces
community_belonging

# Total number of missing cells in each table, one 1 x 1 tibble per table.
# Author's note on the results: "Great, no missing values".
tibble(count = sum(is.na(green_spaces)))
tibble(count = sum(is.na(community_belonging)))
tibble(count = sum(is.na(neighbourhood_rating)))
# Tabulate the distinct values of each descriptive column in the
# neighbourhood-rating table to see which breakdowns are available.
neighbourhood_rating %>%
  count(date_code)
neighbourhood_rating %>%
  count(walking_distance_to_nearest_greenspace)
# type_of_tenure used to be tabulated twice in a row; once is enough.
neighbourhood_rating %>%
  count(type_of_tenure)
neighbourhood_rating %>%
  count(measurement)
neighbourhood_rating %>%
  count(units)
neighbourhood_rating %>%
  count(simd_quintiles)

# Same check on the community-belonging table. The object is called
# `community_belonging`; the earlier reference to `community` failed with
# "object 'community' not found".
community_belonging %>%
  count(measurement)
# Council-area lookup table (codes and names), with cleaned column names.
council_areas <- clean_names(
  read_csv("../raw_data/967937c4-8d67-4f39-974f-fd58c4acfda5.csv")
)
Rows: 44 Columns: 14── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): CA, CAName, HSCP, HSCPName, HB, HBName, Country
dbl (7): _id, CADateEnacted, CADateArchived, HSCPDateEnacted, HSCPDateArchived, HBDateEnacted, HBDateArchived
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the council-area code (`ca`) and its name (`ca_name`).
council_areas <- council_areas %>%
  select(ca, ca_name)

# Add the council-area name to each survey table by matching `feature_code`
# to `ca`. The lookup is deliberately NOT renamed first: the earlier
# `rename(feature_code = ca)` call threw its result away, and assigning it
# would have broken the `by = c("feature_code" = "ca")` joins below.
# inner_join() keeps only rows whose feature_code has a match in council_areas.
green_spaces_joined <- inner_join(
  green_spaces, council_areas,
  by = c("feature_code" = "ca")
)
green_spaces_joined

community_belonging_joined <- inner_join(
  community_belonging, council_areas,
  by = c("feature_code" = "ca")
)
community_belonging_joined

neighbourhood_rating_joined <- inner_join(
  neighbourhood_rating, council_areas,
  by = c("feature_code" = "ca")
)
neighbourhood_rating_joined
# Exploratory row counts on the joined neighbourhood-rating table.
# `sort = TRUE` orders a tally by descending n.

# Distribution of `value` for rows more than 10 minutes from greenspace.
count(
  filter(
    neighbourhood_rating_joined,
    walking_distance_to_nearest_greenspace == "More than 10 minutes"
  ),
  value
)

# Rating categories within each walking-distance band.
count(
  filter(
    neighbourhood_rating_joined,
    walking_distance_to_nearest_greenspace == "Less than 10 minutes"
  ),
  neighbourhood_rating
)
count(
  filter(
    neighbourhood_rating_joined,
    walking_distance_to_nearest_greenspace == "More than 10 minutes"
  ),
  neighbourhood_rating
)

# Rows per council area: each distance band, then all rows.
count(
  filter(
    neighbourhood_rating_joined,
    walking_distance_to_nearest_greenspace == "Less than 10 minutes"
  ),
  ca_name,
  sort = TRUE
)
count(
  filter(
    neighbourhood_rating_joined,
    walking_distance_to_nearest_greenspace == "More than 10 minutes"
  ),
  ca_name,
  sort = TRUE
)
count(neighbourhood_rating_joined, ca_name, sort = TRUE)

# Rows per council area in 2019, by distance band and overall.
count(
  filter(
    neighbourhood_rating_joined,
    date_code == "2019" &
      walking_distance_to_nearest_greenspace == "Less than 10 minutes"
  ),
  ca_name,
  sort = TRUE
)
count(
  filter(
    neighbourhood_rating_joined,
    date_code == "2019" &
      walking_distance_to_nearest_greenspace == "More than 10 minutes"
  ),
  ca_name,
  sort = TRUE
)
count(
  filter(neighbourhood_rating_joined, date_code == "2019"),
  ca_name,
  sort = TRUE
)

# Rows per council area in 2018.
count(
  filter(neighbourhood_rating_joined, date_code == "2018"),
  ca_name,
  sort = TRUE
)

# Which years are present, and how many rows each has.
count(neighbourhood_rating_joined, date_code)

# Rows per council area in 2017 and 2016.
count(
  filter(neighbourhood_rating_joined, date_code == "2017"),
  ca_name,
  sort = TRUE
)
count(
  filter(neighbourhood_rating_joined, date_code == "2016"),
  ca_name,
  sort = TRUE
)

# Gender categories in the joined green-space table.
count(green_spaces_joined, gender)
##Are there certain groups that have/ lack local access to green space?
green_spaces_joined

# Distance bands present in the table.
count(green_spaces_joined, distance_to_nearest_green_or_blue_space)

# Gender categories within each distance band.
count(
  filter(
    green_spaces_joined,
    distance_to_nearest_green_or_blue_space == "A 5 minute walk or less"
  ),
  gender
)
count(
  filter(
    green_spaces_joined,
    distance_to_nearest_green_or_blue_space == "An 11 minute walk or more"
  ),
  gender
)
count(
  filter(
    green_spaces_joined,
    distance_to_nearest_green_or_blue_space == "Within a 6-10 minute walk"
  ),
  gender
)

# Age: all rows, then the closest band in 2018, then the farthest band.
count(green_spaces_joined, age)
count(
  filter(
    green_spaces_joined,
    distance_to_nearest_green_or_blue_space == "A 5 minute walk or less" &
      date_code == "2018"
  ),
  age
)
count(
  filter(
    green_spaces_joined,
    distance_to_nearest_green_or_blue_space == "An 11 minute walk or more"
  ),
  age
)
green_spaces_joined

# SIMD groups: closest band in 2013, farthest band, closest band in 2019.
count(
  filter(
    green_spaces_joined,
    distance_to_nearest_green_or_blue_space == "A 5 minute walk or less" &
      date_code == "2013"
  ),
  simd_quintiles
)
count(
  filter(
    green_spaces_joined,
    distance_to_nearest_green_or_blue_space == "An 11 minute walk or more"
  ),
  simd_quintiles
)
green_spaces_joined
count(
  filter(
    green_spaces_joined,
    distance_to_nearest_green_or_blue_space == "A 5 minute walk or less" &
      date_code == "2019"
  ),
  simd_quintiles
)
# Convert the categorical columns to factors with an explicit level order so
# plots and tables list them in a meaningful order rather than alphabetically.
# The result is assigned back to green_spaces_joined: previously the mutated
# table was only printed, so none of the later plots saw these level orders.
# NOTE(review): factor() turns any value not listed in `levels` into NA --
# confirm the spellings against count(age), count(simd_quintiles) and
# count(distance_to_nearest_green_or_blue_space) before relying on this.
green_spaces_joined <- green_spaces_joined %>%
  mutate(
    age = factor(
      age,
      levels = c("16-34 years", "35-64 years", "65 years and over", "All")
    ),
    simd_quintiles = factor(
      simd_quintiles,
      levels = c("20% most deprived", "80% least deprived", "All")
    ),
    distance_to_nearest_green_or_blue_space = factor(
      distance_to_nearest_green_or_blue_space,
      levels = c(
        "A 5 minute walk or less",
        "Within a 6-10 minute walk",
        "An 11 minute walk or more",
        "Don't Know"
      )
    )
  )
green_spaces_joined
# Number of rows in each distance band, split by SIMD group, one panel per
# year. Rows whose simd_quintiles is "All" are dropped first.
green_spaces_joined %>%
  filter(simd_quintiles != "All") %>%
  ggplot() +
  geom_bar(
    aes(distance_to_nearest_green_or_blue_space, fill = simd_quintiles),
    position = "dodge"
  ) +
  facet_wrap(vars(date_code))

# Same idea with the axes swapped: SIMD group on x, distance band as fill.
# Only rows whose measurement is "Percent" are kept.
green_spaces_joined %>%
  filter(measurement == "Percent") %>%
  ggplot() +
  geom_bar(
    aes(simd_quintiles, fill = distance_to_nearest_green_or_blue_space),
    position = "dodge"
  ) +
  facet_wrap(vars(date_code))

# All years pooled and no filtering: rows per distance band by SIMD group.
green_spaces_joined %>%
  ggplot() +
  geom_bar(
    aes(distance_to_nearest_green_or_blue_space, fill = simd_quintiles),
    position = "dodge"
  )
# Mean of `value` for every age group / distance band pair, as dodged bars.
green_spaces_joined %>%
  group_by(age, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(age, mean_percentage, fill = distance_to_nearest_green_or_blue_space),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'age'. You can override using the `.groups` argument.
# Mean of `value` per council area and distance band, one panel per council.
# The distance band is on the x axis. The earlier version put ca_name on x
# AND faceted by ca_name, so every panel reserved an x position for every
# council while only showing bars for one of them.
green_spaces_joined %>%
  #filter(measurement == "Percent") %>%
  group_by(ca_name, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = distance_to_nearest_green_or_blue_space,
    y = mean_percentage,
    fill = distance_to_nearest_green_or_blue_space
  )) +
  geom_col() +
  facet_wrap(~ca_name) +
  # The fill legend already identifies the bands, so drop the crowded labels.
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
`summarise()` has grouped output by 'ca_name'. You can override using the `.groups` argument.
# SIMD quintiles and distance to green space: mean of `value` per pair,
# leaving out the "All" SIMD rows.
green_spaces_joined %>%
  filter(simd_quintiles != "All") %>%
  group_by(simd_quintiles, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      simd_quintiles,
      mean_percentage,
      fill = distance_to_nearest_green_or_blue_space
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'simd_quintiles'. You can override using the `.groups` argument.
# Mean of `value` per SIMD group and distance band, "All" rows included.
# simd_quintiles has to be a grouping column: the earlier
# `group_by(, distance_...)` left it out, so it was missing from the summary
# and ggplot() failed with "object 'simd_quintiles' not found".
green_spaces_joined %>%
  #filter(measurement == "Percent") %>%
  group_by(simd_quintiles, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = simd_quintiles,
    y = mean_percentage,
    fill = distance_to_nearest_green_or_blue_space
  )) +
  geom_col(position = "dodge")
# Keep only the rows where simd_quintiles is "All", then plot the mean of
# `value` per distance band as stacked bars.
# The subset is taken with filter() before summarising. Passing
# subset(green_spaces_joined, ...) as ggplot()'s second argument put a data
# frame where the aesthetic mapping belongs, which raised
# "Mapping should be created with `aes()`".
green_spaces_joined %>%
  #filter(measurement == "Percent") %>%
  filter(simd_quintiles == "All") %>%
  group_by(simd_quintiles, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_col(aes(
    simd_quintiles,
    mean_percentage,
    fill = distance_to_nearest_green_or_blue_space
  ))
# Mean of `value` per gender category (including "All") and distance band.
# (A measurement == "Percent" filter was tried here and left switched off.)
green_spaces_joined %>%
  group_by(gender, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      gender,
      mean_percentage,
      fill = distance_to_nearest_green_or_blue_space
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
# Mean of `value` per urban/rural class (including "All") and distance band.
green_spaces_joined %>%
  group_by(
    urban_rural_classification,
    distance_to_nearest_green_or_blue_space
  ) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      urban_rural_classification,
      mean_percentage,
      fill = distance_to_nearest_green_or_blue_space
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'urban_rural_classification'. You can override using the `.groups` argument.
# Type of tenure: mean of `value` per tenure type and distance band,
# without the "All" tenure rows.
green_spaces_joined %>%
  filter(type_of_tenure != "All") %>%
  group_by(type_of_tenure, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      type_of_tenure,
      mean_percentage,
      fill = distance_to_nearest_green_or_blue_space
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'type_of_tenure'. You can override using the `.groups` argument.
# Household type: mean of `value` per household type and distance band,
# without the "All" household rows.
green_spaces_joined %>%
  filter(household_type != "All") %>%
  group_by(household_type, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      household_type,
      mean_percentage,
      fill = distance_to_nearest_green_or_blue_space
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'household_type'. You can override using the `.groups` argument.
# Ethnicity: mean of `value` per ethnicity category (including "All") and
# distance band.
green_spaces_joined %>%
  group_by(ethnicity, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      ethnicity,
      mean_percentage,
      fill = distance_to_nearest_green_or_blue_space
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'ethnicity'. You can override using the `.groups` argument.
# Ethnicity, restricted to the "All" rows.
# Rows are selected with filter() on the piped table. The earlier
# `ggplot(subset(ethnicity %in% "All"), ...)` called subset() without any data,
# so `ethnicity` could not be found.
green_spaces_joined %>%
  filter(ethnicity %in% "All") %>%
  group_by(ethnicity, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = ethnicity,
    y = mean_percentage,
    fill = distance_to_nearest_green_or_blue_space
  )) +
  geom_col(position = "dodge")
# Raw `value` of every row whose ethnicity is "All", dodged by distance band
# (no averaging here, unlike the summary plots).
green_spaces_joined %>%
  filter(ethnicity %in% "All") %>%
  ggplot(aes(ethnicity, value, fill = distance_to_nearest_green_or_blue_space)) +
  geom_bar(stat = "identity", position = "dodge")
# Ethnicity without the "All" category.
# A column cannot be called like a function: `ethnicity(-c("All"))` failed
# with "could not find function "ethnicity"" both inside aes() and inside
# group_by(). Rows are excluded with filter() before grouping instead, so the
# two failed attempts collapse into this one working pipeline.
green_spaces_joined %>%
  filter(ethnicity != "All") %>%
  group_by(ethnicity, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = ethnicity,
    y = mean_percentage,
    fill = distance_to_nearest_green_or_blue_space
  )) +
  geom_col(position = "dodge")
# Ethnicity: mean of `value` per ethnicity and distance band, with the "All"
# rows removed before grouping.
green_spaces_joined %>%
  filter(ethnicity != "All") %>%
  group_by(ethnicity, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      ethnicity,
      mean_percentage,
      fill = distance_to_nearest_green_or_blue_space
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'ethnicity'. You can override using the `.groups` argument.
# Ethnicity again: same summary and chart as the chunk directly above
# (mean of `value` per ethnicity and distance band, "All" rows removed).
green_spaces_joined %>%
  filter(ethnicity != "All") %>%
  group_by(ethnicity, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    ethnicity,
    mean_percentage,
    fill = distance_to_nearest_green_or_blue_space
  )) +
  geom_bar(stat = "identity", position = "dodge")
`summarise()` has grouped output by 'ethnicity'. You can override using the `.groups` argument.
green_spaces_joined

# Hypothesis test on the gender rows.
# Sample: rows with a specific gender (not "All") whose measurement is
# "Percent".
green_spaces_gender <- filter(
  green_spaces_joined,
  gender != "All" & measurement == "Percent"
)
library(infer)

# Bootstrap distribution of the mean of `value` under a point null of
# mu = 3.93, using 10000 resamples.
null_distribution <- calculate(
  generate(
    hypothesize(
      specify(green_spaces_gender, response = value),
      null = "point",
      mu = 3.93
    ),
    reps = 10000,
    type = "bootstrap"
  ),
  stat = "mean"
)
visualise(null_distribution, bins = 30)

# Observed mean of `value`, kept as a 1 x 1 tibble with column mean_rating.
observed_gender_stat <- tibble(mean_rating = mean(green_spaces_gender$value))

# Null distribution with the two-sided p-value region shaded.
visualise(null_distribution, bins = 30) +
  shade_p_value(
    obs_stat = observed_gender_stat$mean_rating,
    direction = "both"
  )

# Two-sided p-value of the observed mean against the null distribution.
p_value <- get_p_value(
  null_distribution,
  obs_stat = observed_gender_stat$mean_rating,
  direction = "both"
)
Warning: Please be cautious in reporting a p-value of 0. This result is an approximation based on the number of `reps` chosen in the `generate()` step. See `?get_p_value()` for more information.
p_value

# Gender and distance to green space: mean of `value` per gender and
# distance band, without the "All" gender rows.
green_spaces_joined %>%
  filter(gender != "All") %>%
  group_by(gender, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      gender,
      mean_percentage,
      fill = distance_to_nearest_green_or_blue_space
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
# Urban/rural classification and distance to green space: mean of `value`
# per class and distance band, without the "All" class rows.
green_spaces_joined %>%
  filter(urban_rural_classification != "All") %>%
  group_by(
    urban_rural_classification,
    distance_to_nearest_green_or_blue_space
  ) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      urban_rural_classification,
      mean_percentage,
      fill = distance_to_nearest_green_or_blue_space
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'urban_rural_classification'. You can override using the `.groups` argument.
# Working on neighbourhood rating.
neighbourhood_rating_joined
community_belonging_joined
count(neighbourhood_rating_joined, neighbourhood_rating)

# Interesting data. Binning the ratings into "good" and "poor" might make the
# difference more evident; alternatively the two could go in separate plots.
# Mean of `value` per rating and walking-distance band ("All" excluded).
neighbourhood_rating_joined %>%
  filter(walking_distance_to_nearest_greenspace != "All") %>%
  group_by(neighbourhood_rating, walking_distance_to_nearest_greenspace) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      neighbourhood_rating,
      mean_percentage,
      fill = walking_distance_to_nearest_greenspace
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'neighbourhood_rating'. You can override using the `.groups` argument.
# "Very good" ratings only: mean of `value` per walking-distance band.
neighbourhood_rating_joined %>%
  filter(
    walking_distance_to_nearest_greenspace != "All" &
      neighbourhood_rating == "Very good"
  ) %>%
  group_by(neighbourhood_rating, walking_distance_to_nearest_greenspace) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      neighbourhood_rating,
      mean_percentage,
      fill = walking_distance_to_nearest_greenspace
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'neighbourhood_rating'. You can override using the `.groups` argument.
# "Very poor" ratings only: mean of `value` per walking-distance band.
neighbourhood_rating_joined %>%
  filter(
    walking_distance_to_nearest_greenspace != "All" &
      neighbourhood_rating == "Very poor"
  ) %>%
  group_by(neighbourhood_rating, walking_distance_to_nearest_greenspace) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      neighbourhood_rating,
      mean_percentage,
      fill = walking_distance_to_nearest_greenspace
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'neighbourhood_rating'. You can override using the `.groups` argument.
# TODO from the original notes: bin the ratings into good / very good and
# poor / very poor (not implemented here).

# Urban/rural areas and distance to green space: mean of `value` per class
# and distance band, without the "All" class rows.
green_spaces_joined %>%
  filter(urban_rural_classification != "All") %>%
  group_by(
    urban_rural_classification,
    distance_to_nearest_green_or_blue_space
  ) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    urban_rural_classification,
    mean_percentage,
    fill = distance_to_nearest_green_or_blue_space
  )) +
  geom_bar(stat = "identity", position = "dodge")
`summarise()` has grouped output by 'urban_rural_classification'. You can override using the `.groups` argument.
# Urban/rural areas and distance to green space, this time using only rows
# whose measurement is "Percent" (and still dropping the "All" class rows).
green_spaces_joined %>%
  filter(urban_rural_classification != "All" & measurement == "Percent") %>%
  group_by(
    urban_rural_classification,
    distance_to_nearest_green_or_blue_space
  ) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot() +
  geom_bar(
    aes(
      urban_rural_classification,
      mean_percentage,
      fill = distance_to_nearest_green_or_blue_space
    ),
    stat = "identity",
    position = "dodge"
  )
`summarise()` has grouped output by 'urban_rural_classification'. You can override using the `.groups` argument.
# Next step: hypothesis testing on urban vs rural.
green_spaces_joined
# Which measurement types the table contains, and how many rows of each.
count(green_spaces_joined, measurement)

# Aggregate responses dataset: read and print it without storing it.
read_csv(file = "../raw_data/shs_aggregate_responses (1).csv")
Rows: 58530 Columns: 13── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (10): community_belonging, neighbourhood_rating, distance_to_nearest_green_space, satisfaction_with_nearest_green_sp...
dbl (3): year, household_size, n_persons
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##time series, how does access to green spaces change over time?
green_spaces_joined
green_spaces_joined %>%
  count(distance_to_nearest_green_or_blue_space)

# Mean of `value` per neighbourhood rating and walking-distance band.
# This has to start from neighbourhood_rating_joined. The earlier version
# piped green_spaces_joined, which is not used with a neighbourhood_rating or
# walking_distance_to_nearest_greenspace column anywhere else in this script,
# and it grouped by one distance column while filling by another.
# NOTE(review): this chunk sits under the time-series heading but does not use
# date_code -- confirm whether a per-year version was intended.
neighbourhood_rating_joined %>%
  filter(walking_distance_to_nearest_greenspace != "All") %>%
  group_by(neighbourhood_rating, walking_distance_to_nearest_greenspace) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = neighbourhood_rating,
    y = mean_percentage,
    fill = walking_distance_to_nearest_greenspace
  )) +
  geom_col(position = "dodge")
# Mean of `value` per year for each distance band, one panel per band.
# Lines are drawn with `colour`: geom_line() has no fill, and
# position = "dodge" is a bar adjustment that does nothing useful for lines.
green_spaces_joined %>%
  filter(simd_quintiles != "All") %>%
  group_by(date_code, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = date_code,
    y = mean_percentage,
    colour = distance_to_nearest_green_or_blue_space
  )) +
  geom_line() +
  facet_wrap(~ distance_to_nearest_green_or_blue_space)
`summarise()` has grouped output by 'date_code'. You can override using the `.groups` argument.
# Mean of `value` per distance band and SIMD group for every year, to see
# whether the groups differ from year to year. "All" SIMD rows are left out.
# Drawn as dodged bars: the x axis is categorical, so geom_line() put every
# x / fill combination in its own one-point group and could not join them.
green_spaces_joined %>%
  filter(simd_quintiles != "All") %>%
  group_by(
    simd_quintiles,
    date_code,
    distance_to_nearest_green_or_blue_space
  ) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = distance_to_nearest_green_or_blue_space,
    y = mean_percentage,
    fill = simd_quintiles
  )) +
  geom_col(position = "dodge") +
  facet_wrap(~date_code)
`summarise()` has grouped output by 'simd_quintiles', 'date_code'. You can override using the `.groups` argument.
# Plot the shortest and longest walking distances over time:
# "A 5 minute walk or less" versus "An 11 minute walk or more".
# The band is mapped to `colour` so the two lines can be told apart;
# geom_line() ignores `fill`, which left both lines black with no legend.
green_spaces_joined %>%
  filter(distance_to_nearest_green_or_blue_space %in% c(
    "A 5 minute walk or less",
    "An 11 minute walk or more"
  )) %>%
  group_by(date_code, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = date_code,
    y = mean_percentage,
    colour = distance_to_nearest_green_or_blue_space
  )) +
  geom_line()
# One line chart per distance band: mean of `value` per year.
# The band is mapped to `colour` (geom_line() ignores `fill`), which also
# gives each chart a legend naming the band it shows.

# 11 minute walk or more, over time
green_spaces_joined %>%
  filter(
    distance_to_nearest_green_or_blue_space == "An 11 minute walk or more"
  ) %>%
  group_by(date_code, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = date_code,
    y = mean_percentage,
    colour = distance_to_nearest_green_or_blue_space
  )) +
  geom_line()

# 5 minute walk or less, over time
green_spaces_joined %>%
  filter(
    distance_to_nearest_green_or_blue_space == "A 5 minute walk or less"
  ) %>%
  group_by(date_code, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = date_code,
    y = mean_percentage,
    colour = distance_to_nearest_green_or_blue_space
  )) +
  geom_line()

# 6-10 minute walk, over time (this chunk was previously mislabelled as the
# 11 minute walk)
green_spaces_joined %>%
  filter(
    distance_to_nearest_green_or_blue_space == "Within a 6-10 minute walk"
  ) %>%
  group_by(date_code, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = date_code,
    y = mean_percentage,
    colour = distance_to_nearest_green_or_blue_space
  )) +
  geom_line()
# All distance bands over time, excluding "All".
# `colour` replaces `fill`, which geom_line() ignores.
green_spaces_joined %>%
  filter(distance_to_nearest_green_or_blue_space != "All") %>%
  group_by(date_code, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = date_code,
    y = mean_percentage,
    colour = distance_to_nearest_green_or_blue_space
  )) +
  geom_line()

# All distance bands over time, excluding "All" and "Don't Know".
# The "Don't Know" exclusion was announced in the heading but was missing
# from the filter.
green_spaces_joined %>%
  filter(
    distance_to_nearest_green_or_blue_space != "All",
    distance_to_nearest_green_or_blue_space != "Don't Know"
  ) %>%
  group_by(date_code, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    x = date_code,
    y = mean_percentage,
    colour = distance_to_nearest_green_or_blue_space
  )) +
  geom_point() +
  geom_line()
`summarise()` has grouped output by 'date_code'. You can override using the `.groups` argument.
# West Dunbartonshire vs East Lothian: mean "Percent" value per year and
# distance band ("Don't Know" excluded), one panel per council. Years are
# treated as discrete, so the lines need an explicit group per band.
green_spaces_joined %>%
  filter(
    ca_name %in% c("West Dunbartonshire", "East Lothian"),
    distance_to_nearest_green_or_blue_space != "Don't Know",
    measurement == "Percent"
  ) %>%
  group_by(ca_name, date_code, distance_to_nearest_green_or_blue_space) %>%
  summarise(mean_percentage = mean(value)) %>%
  ggplot(aes(
    factor(date_code),
    mean_percentage,
    colour = distance_to_nearest_green_or_blue_space
  )) +
  geom_point() +
  geom_line(aes(group = distance_to_nearest_green_or_blue_space)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  facet_wrap(vars(ca_name))
green_spaces_joined
neighbourhood_rating_joined

# Question: is there a way to predict which households give a higher rating?
# Load the aggregate responses data for that.
aggregate_responses <- read_csv(
  file = "../raw_data/shs_aggregate_responses (1).csv"
)
Rows: 58530 Columns: 13── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (10): community_belonging, neighbourhood_rating, distance_to_nearest_green_space, satisfaction_with_nearest_green_s...
dbl (3): year, household_size, n_persons
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick checks on the aggregate responses: category tallies and NA counts.
count(aggregate_responses, community_belonging)
tibble(
  count = sum(is.na(aggregate_responses$satisfaction_with_nearest_green_space))
)
count(aggregate_responses, satisfaction_with_nearest_green_space)
tibble(count = sum(is.na(aggregate_responses$age)))
aggregate_responses
count(aggregate_responses, highest_education_level)
# Modelling table: recode the categorical survey answers as factors with an
# explicit level order, then keep only the nine columns used for modelling.
aggregate_responses_adjusted <- aggregate_responses %>%
  mutate(
    community_belonging = factor(
      community_belonging,
      levels = c(
        "Not at all strongly",
        "Not very strongly",
        "Don't know",
        "Fairly strongly",
        "Very strongly"
      )
    ),
    neighbourhood_rating = factor(
      neighbourhood_rating,
      levels = c(
        "Very poor",
        "Fairly poor",
        "No opinion",
        "Fairly good",
        "Very good"
      )
    ),
    distance_to_nearest_green_space = factor(
      distance_to_nearest_green_space,
      levels = c(
        "A 5 minute walk or less",
        "Within a 6-10 minute walk",
        "Within an 11-20 minute walk",
        "Within a 21-30 minute walk",
        "More than a 30 minute walk away",
        "Don't know"
      )
    ),
    age = factor(
      age,
      levels = c("16 - 34 Years", "35 - 64 Years", "65 + Years")
    ),
    gender = factor(gender, levels = c("Female", "Male")),
    economic_status = factor(
      economic_status,
      levels = c(
        "Full Time Employment",
        "Other",
        "Part Time Employment",
        "Retired",
        "Self Employed",
        "Training"
      )
    ),
    highest_education_level = factor(
      highest_education_level,
      levels = c(
        "Standard grade or equiv (SVQ level 1 or 2).",
        "Higher, A level or equivalent (SVQ Level 3)",
        "HNC/HND or equivalent (SVQ Level 4)",
        "Degree, Professional qualification (Above SVQ Level 4)",
        "Other qualification"
      )
    )
  ) %>%
  select(
    community_belonging,
    neighbourhood_rating,
    distance_to_nearest_green_space,
    satisfaction_with_nearest_green_space,
    age,
    gender,
    economic_status,
    household_size,
    highest_education_level
  )
aggregate_responses_adjusted
# Linear model of household_size explained by neighbourhood_rating, fitted on
# the raw aggregate_responses table.
modela <- lm(formula = household_size ~ neighbourhood_rating, data = aggregate_responses)
# ggfortify is loaded here so that autoplot() can be called on the lm fit.
library(ggfortify)
autoplot(modela)
# Model neighbourhood rating from satisfaction with the nearest green space.
# lm() needs a numeric response. neighbourhood_rating is stored as text, so
# the earlier fit warned that '-' is "not meaningful for factors" and
# summary() stopped with "(unordered) factors are not allowed". The rating is
# therefore scored 1 ("Very poor") to 5 ("Very good") inside the formula.
# NOTE(review): treating the rating as an equally spaced score, with
# "No opinion" as the midpoint, is a modelling assumption -- an ordinal model
# may be more appropriate; confirm before interpreting coefficients.
modelb <- lm(
  formula = as.integer(factor(
    neighbourhood_rating,
    levels = c(
      "Very poor",
      "Fairly poor",
      "No opinion",
      "Fairly good",
      "Very good"
    )
  )) ~ satisfaction_with_nearest_green_space,
  data = aggregate_responses
)
autoplot(modelb)
summary(modelb)
neighbourhood_rating_joined %>%
  count(walking_distance_to_nearest_greenspace)

# Keep the value plus every breakdown column, then drop rows where any
# breakdown is the catch-all "All".
# Two fixes: select() is now piped into filter() (without the `%>%`, filter()
# ran on its own and failed with "no applicable method for 'filter'"), and
# the misspelt column `ehthnicity` is corrected to `ethnicity`.
# NOTE(review): if each row only breaks down one dimension at a time,
# requiring every column to differ from "All" may leave no rows -- check the
# row count of the result.
neighbourhood_rating_joined %>%
  select(
    value, neighbourhood_rating, gender, urban_rural_classification,
    simd_quintiles, type_of_tenure, household_type, ethnicity,
    walking_distance_to_nearest_greenspace
  ) %>%
  filter(
    neighbourhood_rating != "All",
    gender != "All",
    urban_rural_classification != "All",
    simd_quintiles != "All",
    type_of_tenure != "All",
    household_type != "All",
    ethnicity != "All",
    walking_distance_to_nearest_greenspace != "All"
  )
neighbourhood_rating_joined

# Tally gender over rows where at least one of rating, walking distance or
# gender is something other than "All".
count(
  filter(
    neighbourhood_rating_joined,
    neighbourhood_rating != "All" |
      walking_distance_to_nearest_greenspace != "All" |
      gender != "All"
  ),
  gender
)
# Further conditions that were tried and left switched off:
#   simd_quintiles != "All", type_of_tenure != "All",
#   household_type != "All", ethnicity != "All",
#   walking_distance_to_nearest_greenspace != "All",
#   urban_rural_classification != "All"
# Model neighbourhood rating from (c) distance to the nearest green space and
# (d) community belonging.
# neighbourhood_rating is text, which lm() cannot use as a response (the
# earlier fits warned that '-' is "not meaningful for factors"), so it is
# scored 1 ("Very poor") to 5 ("Very good") inside each formula.
# NOTE(review): the equally spaced score, with "No opinion" in the middle, is
# a modelling assumption -- confirm it is acceptable.
modelc <- lm(
  formula = as.integer(factor(
    neighbourhood_rating,
    levels = c(
      "Very poor",
      "Fairly poor",
      "No opinion",
      "Fairly good",
      "Very good"
    )
  )) ~ distance_to_nearest_green_space,
  data = aggregate_responses
)
autoplot(modelc)

modeld <- lm(
  formula = as.integer(factor(
    neighbourhood_rating,
    levels = c(
      "Very poor",
      "Fairly poor",
      "No opinion",
      "Fairly good",
      "Very good"
    )
  )) ~ community_belonging,
  data = aggregate_responses
)
# Diagnostics for modeld; this call used to repeat autoplot(modelc).
autoplot(modeld)
# Model neighbourhood rating from walking distance to the nearest greenspace.
# Fitting directly on neighbourhood_rating_joined failed ("NA/NaN/Inf in
# 'y'") because the rating is text, so the two columns are first recoded as
# factors and the model is fitted once, on the filtered table.
neighbourhood_rating_joined
neighbourhood_rating_joined %>%
  count(walking_distance_to_nearest_greenspace)

neighbourhood_and_walking <- neighbourhood_rating_joined %>%
  select(neighbourhood_rating, walking_distance_to_nearest_greenspace) %>%
  mutate(
    neighbourhood_rating = factor(
      neighbourhood_rating,
      levels = c(
        "Very poor",
        "Fairly poor",
        "No opinion",
        "Fairly good",
        "Very good"
      )
    ),
    walking_distance_to_nearest_greenspace = factor(
      walking_distance_to_nearest_greenspace,
      levels = c("Less than 10 minutes", "More than 10 minutes", "All")
    )
  )
aggregate_responses

# Drop the catch-all "All" distance rows before modelling.
neighbourhood_filtered <- neighbourhood_and_walking %>%
  filter(walking_distance_to_nearest_greenspace != "All")

# as.integer() turns the ordered rating levels into a 1-5 score; a factor
# response made lm() warn that '-' is "not meaningful for factors". Ratings
# outside the five listed levels become NA and are left out of the fit.
# NOTE(review): presumably each row here is a published summary figure rather
# than one respondent -- confirm that modelling the rating label per row is
# meaningful.
modele <- lm(
  formula = as.integer(neighbourhood_rating) ~
    walking_distance_to_nearest_greenspace,
  data = neighbourhood_filtered
)
autoplot(modele)
aggregate_responses
aggregate_responses %>%
  count(household_size)
# `modelf` is never created in this script. The call recorded in the output
# below (neighbourhood_rating ~ community_belonging on aggregate_responses)
# is the one used to fit `modeld`, so that model is summarised instead.
summary(modeld)
Warning: ‘^’ not meaningful for factors
Call:
lm(formula = neighbourhood_rating ~ community_belonging, data = aggregate_responses)
Residuals:
Error in quantile.default(resid) : (unordered) factors are not allowed
# Fit modelg before summarising it: the only other assignment to `modelg` is
# on the last line of this script, after this call. The formula and data match
# the `Call:` recorded in the output below.
modelg <- lm(
  formula = household_size ~ distance_to_nearest_green_space,
  data = aggregate_responses
)
summary(modelg)
Call:
lm(formula = household_size ~ distance_to_nearest_green_space,
data = aggregate_responses)
Residuals:
Min 1Q Median 3Q Max
-1.2628 -1.0999 -0.2628 0.7372 7.7372
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.26281 0.00636 355.793 < 2e-16 ***
distance_to_nearest_green_spaceDon't know -0.58924 0.06075 -9.700 < 2e-16 ***
distance_to_nearest_green_spaceMore than a 30 minute walk away -0.21721 0.02887 -7.523 5.44e-14 ***
distance_to_nearest_green_spaceWithin a 21-30 minute walk -0.19382 0.02640 -7.340 2.16e-13 ***
distance_to_nearest_green_spaceWithin a 6-10 minute walk -0.16292 0.01213 -13.435 < 2e-16 ***
distance_to_nearest_green_spaceWithin an 11-20 minute walk -0.25214 0.01638 -15.393 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.187 on 58524 degrees of freedom
Multiple R-squared: 0.00792, Adjusted R-squared: 0.007836
F-statistic: 93.45 on 5 and 58524 DF, p-value: < 2.2e-16
aggregate_responses
library(GGally)
# Pairwise overview of every column in aggregate_responses.
ggpairs(aggregate_responses)

# Neighbourhood rating against distance to the nearest green space.
# Both variables are categorical, so the rows fall on a small grid of
# positions: geom_count() sizes each point by the number of rows at that
# position, where geom_point() drew them on top of each other. The
# geom_smooth(method = "lm") layer is dropped because a regression line
# needs numeric x and y.
aggregate_responses %>%
  ggplot(aes(x = neighbourhood_rating, y = distance_to_nearest_green_space)) +
  geom_count()
aggregate_responses

# Rank correlation between neighbourhood rating and distance to the nearest
# green space. The earlier `summarise(cor = (a, b))` was a syntax error: the
# cor() call itself was missing. cor() also needs numbers, so the integer
# codes of the ordered factors in aggregate_responses_adjusted are used, with
# Spearman's method because the variables are ordinal.
# NOTE(review): the level order places "No opinion" mid-scale and
# "Don't know" as the farthest distance -- consider excluding those answers
# before interpreting the coefficient.
aggregate_responses_adjusted %>%
  summarise(
    cor = cor(
      as.integer(neighbourhood_rating),
      as.integer(distance_to_nearest_green_space),
      method = "spearman",
      use = "complete.obs"
    )
  )
#so maybe the problem is that I have factored my variable? Do I need to factor it?
# Household size modelled on neighbourhood rating using
# aggregate_responses_adjusted; diagnostics first, then the summary table.
modelh <- lm(
  household_size ~ neighbourhood_rating,
  data = aggregate_responses_adjusted
)
autoplot(modelh)
modelh %>%
  summary()
Call:
lm(formula = household_size ~ neighbourhood_rating, data = aggregate_responses_adjusted)
Residuals:
Min 1Q Median 3Q Max
-1.2207 -1.1531 -0.2207 0.7793 7.7793
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.0516917 0.0364949 56.219 < 2e-16 ***
neighbourhood_ratingFairly poor 0.0007482 0.0429878 0.017 0.98611
neighbourhood_ratingNo opinion -0.2885338 0.0937572 -3.077 0.00209 **
neighbourhood_ratingFairly good 0.1014425 0.0373329 2.717 0.00658 **
neighbourhood_ratingVery good 0.1690407 0.0371038 4.556 5.23e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.19 on 58525 degrees of freedom
Multiple R-squared: 0.001974, Adjusted R-squared: 0.001905
F-statistic: 28.93 on 4 and 58525 DF, p-value: < 2.2e-16
# Household size modelled on community belonging using
# aggregate_responses_adjusted; diagnostics first, then the summary table.
modeli <- lm(
  household_size ~ community_belonging,
  data = aggregate_responses_adjusted
)
autoplot(modeli)
modeli %>%
  summary()
Call:
lm(formula = household_size ~ community_belonging, data = aggregate_responses_adjusted)
Residuals:
Min 1Q Median 3Q Max
-1.2441 -1.1238 -0.1791 0.7559 7.7559
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.99492 0.01895 105.248 < 2e-16 ***
community_belongingNot very strongly 0.12888 0.02232 5.775 7.76e-09 ***
community_belongingDon't know 0.02765 0.05307 0.521 0.602
community_belongingFairly strongly 0.24916 0.02047 12.171 < 2e-16 ***
community_belongingVery strongly 0.18422 0.02072 8.889 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.19 on 58525 degrees of freedom
Multiple R-squared: 0.00335, Adjusted R-squared: 0.003282
F-statistic: 49.18 on 4 and 58525 DF, p-value: < 2.2e-16
# Household size modelled on distance to the nearest green space using
# aggregate_responses_adjusted; diagnostics first, then the summary table.
modelg <- lm(
  household_size ~ distance_to_nearest_green_space,
  data = aggregate_responses_adjusted
)
autoplot(modelg)
modelg %>%
  summary()
Call:
lm(formula = household_size ~ distance_to_nearest_green_space,
data = aggregate_responses_adjusted)
Residuals:
Min 1Q Median 3Q Max
-1.2628 -1.0999 -0.2628 0.7372 7.7372
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.26281 0.00636 355.793 < 2e-16 ***
distance_to_nearest_green_spaceWithin a 6-10 minute walk -0.16292 0.01213 -13.435 < 2e-16 ***
distance_to_nearest_green_spaceWithin an 11-20 minute walk -0.25214 0.01638 -15.393 < 2e-16 ***
distance_to_nearest_green_spaceWithin a 21-30 minute walk -0.19382 0.02640 -7.340 2.16e-13 ***
distance_to_nearest_green_spaceMore than a 30 minute walk away -0.21721 0.02887 -7.523 5.44e-14 ***
distance_to_nearest_green_spaceDon't know -0.58924 0.06075 -9.700 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.187 on 58524 degrees of freedom
Multiple R-squared: 0.00792, Adjusted R-squared: 0.007836
F-statistic: 93.45 on 5 and 58524 DF, p-value: < 2.2e-16
# Points of household size against neighbourhood rating, overlaid with a
# straight-line fit drawn without its confidence band.
ggplot(
  aggregate_responses_adjusted,
  aes(x = household_size, y = neighbourhood_rating)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# Rank correlation between household size and neighbourhood rating.
# The previous version omitted the cor() function name, so it did not parse.
# neighbourhood_rating is categorical, so it is converted to integer level
# codes before being passed to cor(); rows with missing values are ignored.
# NOTE(review): the coefficient is only meaningful if the rating levels are in
# a sensible order — confirm the level ordering before interpreting it.
aggregate_responses_adjusted %>%
  summarise(
    cor = cor(
      household_size,
      as.integer(as.factor(neighbourhood_rating)),
      method = "spearman",
      use = "complete.obs"
    )
  )
# Column chart of household_size for each neighbourhood rating category.
ggplot(
  aggregate_responses_adjusted,
  aes(x = neighbourhood_rating, y = household_size)
) +
  geom_col()
# Print the data frame for inspection.
aggregate_responses
# fastDummies supplies dummy_cols(), which this script uses to build
# indicator columns.
library(fastDummies)
Warning: package ‘fastDummies’ was built under R version 4.2.1
# Keep only the columns used for the regressions. The previous version wrapped
# one select() inside another, so the column names were evaluated outside the
# data frame and the call failed with "object ... not found".
aggregate_responses %>%
  select(
    community_belonging, neighbourhood_rating,
    distance_to_nearest_green_space, satisfaction_with_nearest_green_space,
    age, gender, economic_status, household_size, highest_education_level
  )
#creating dummy variables for regression
# One dummy_cols() call handles every categorical column: for each column in
# `select_columns` it appends the indicator columns (dropping the first level)
# and then removes the original column. household_size is left as it is.
aggregate_responses_dummies <- aggregate_responses %>%
  select(
    community_belonging, neighbourhood_rating,
    distance_to_nearest_green_space, satisfaction_with_nearest_green_space,
    age, gender, economic_status, household_size, highest_education_level
  ) %>%
  fastDummies::dummy_cols(
    select_columns = c(
      "neighbourhood_rating",
      "community_belonging",
      "distance_to_nearest_green_space",
      "satisfaction_with_nearest_green_space",
      "age",
      "gender",
      "economic_status",
      "highest_education_level"
    ),
    remove_first_dummy = TRUE,
    remove_selected_columns = TRUE
  )
# Standardise the generated dummy column names with janitor, then print.
aggregate_responses_dummies <- clean_names(aggregate_responses_dummies)
aggregate_responses_dummies
# Reduced data set for the regressions: household size plus a chosen subset of
# the indicator columns, selected by name.
aggregate_responses_trim <- select(
  aggregate_responses_dummies,
  all_of(c(
    "household_size",
    "neighbourhood_rating_very_good",
    "neighbourhood_rating_very_poor",
    "community_belonging_very_strongly",
    "community_belonging_not_at_all_strongly",
    "distance_to_nearest_green_space_more_than_a_30_minute_walk_away",
    "distance_to_nearest_green_space_within_a_6_10_minute_walk",
    "satisfaction_with_nearest_green_space_very_satisfied",
    "satisfaction_with_nearest_green_space_very_dissatisfied",
    "age_35_64_years",
    "age_65_years",
    "gender_male"
  ))
)
aggregate_responses_trim
# Single-predictor fit: "very good" rating indicator on the 6-10 minute walk
# indicator, using the trimmed data.
model1a <- lm(
  neighbourhood_rating_very_good ~
    distance_to_nearest_green_space_within_a_6_10_minute_walk,
  data = aggregate_responses_trim
)
model1a %>%
  summary()
Call:
lm(formula = neighbourhood_rating_very_good ~ distance_to_nearest_green_space_within_a_6_10_minute_walk,
data = aggregate_responses_trim)
Residuals:
Min 1Q Median 3Q Max
-0.5494 -0.5494 0.4506 0.4506 0.4912
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.549453 0.002340 234.82 <2e-16 ***
distance_to_nearest_green_space_within_a_6_10_minute_walk -0.040674 0.004924 -8.26 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4981 on 58528 degrees of freedom
Multiple R-squared: 0.001164, Adjusted R-squared: 0.001147
F-statistic: 68.22 on 1 and 58528 DF, p-value: < 2.2e-16
# "Very good" rating indicator on three predictors from the trimmed data:
# green-space satisfaction, the 6-10 minute walk indicator and the 65+ age band.
model1b <- lm(
  neighbourhood_rating_very_good ~
    satisfaction_with_nearest_green_space_very_satisfied +
    distance_to_nearest_green_space_within_a_6_10_minute_walk +
    age_65_years,
  data = aggregate_responses_trim
)
model1b %>%
  summary()
Call:
lm(formula = neighbourhood_rating_very_good ~ satisfaction_with_nearest_green_space_very_satisfied +
distance_to_nearest_green_space_within_a_6_10_minute_walk +
age_65_years, data = aggregate_responses_trim)
Residuals:
Min 1Q Median 3Q Max
-0.7712 -0.4571 0.2288 0.4445 0.5730
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.457119 0.003231 141.496 < 2e-16 ***
satisfaction_with_nearest_green_space_very_satisfied 0.215689 0.004659 46.296 < 2e-16 ***
distance_to_nearest_green_space_within_a_6_10_minute_walk -0.030166 0.005303 -5.688 1.29e-08 ***
age_65_years 0.098342 0.005062 19.429 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4848 on 48538 degrees of freedom
(9988 observations deleted due to missingness)
Multiple R-squared: 0.05103, Adjusted R-squared: 0.05097
F-statistic: 870 on 3 and 48538 DF, p-value: < 2.2e-16
#ok, better R but not quite there yet
#very good neighbourhood regression
# NOTE(review): this draft had an empty right-hand side in its formula, so it
# did not parse, and its summary() call refers to model1c, which is only
# fitted further down in this script. Both statements are kept as comments
# until the predictor list is decided.
# model1 <- lm(formula = neighbourhood_rating_very_good ~ , data = aggregate_responses_trim)
# summary(model1c)
# Household size on the "very poor" rating indicator, followed by its summary,
# diagnostic plots and a print of the dummy-coded data.
modell <- lm(
  household_size ~ neighbourhood_rating_very_poor,
  data = aggregate_responses_dummies
)
modell %>%
  summary()
autoplot(modell)
aggregate_responses_dummies
# Logistic regression of the "very good" rating indicator on the "very
# strongly" belonging indicator. The response is a 0/1 dummy, so the binomial
# family is stated explicitly: glm() without `family` falls back to gaussian,
# which is just ordinary least squares.
modelm <- glm(
  formula = neighbourhood_rating_very_good ~ community_belonging_very_strongly,
  family = binomial(link = "logit"),
  data = aggregate_responses_dummies
)
summary(modelm)
autoplot(modelm)
# Linear fit of the "very good" rating indicator on the "very strongly"
# belonging indicator. The previous formula ended in a dangling `+` with no
# term after it, which is a parse error; the stray operator is removed.
modelm <- lm(
  formula = neighbourhood_rating_very_good ~ community_belonging_very_strongly,
  data = aggregate_responses_dummies
)
summary(modelm)
autoplot(modelm)
# "Very good" rating indicator on seven predictors from the full dummy-coded
# data: satisfaction, distance, age and community-belonging indicators.
model1c <- lm(
  neighbourhood_rating_very_good ~
    satisfaction_with_nearest_green_space_very_satisfied +
    distance_to_nearest_green_space_within_a_6_10_minute_walk +
    age_65_years +
    community_belonging_very_strongly +
    community_belonging_fairly_strongly +
    satisfaction_with_nearest_green_space_fairly_satisfied +
    distance_to_nearest_green_space_within_an_11_20_minute_walk,
  data = aggregate_responses_dummies
)
model1c %>%
  summary()
Call:
lm(formula = neighbourhood_rating_very_good ~ satisfaction_with_nearest_green_space_very_satisfied +
distance_to_nearest_green_space_within_a_6_10_minute_walk +
age_65_years + community_belonging_very_strongly + community_belonging_fairly_strongly +
satisfaction_with_nearest_green_space_fairly_satisfied +
distance_to_nearest_green_space_within_an_11_20_minute_walk,
data = aggregate_responses_dummies)
Residuals:
Min 1Q Median 3Q Max
-0.8972 -0.4397 0.1509 0.3674 0.8015
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.231645 0.005622 41.206 < 2e-16 ***
satisfaction_with_nearest_green_space_very_satisfied 0.208030 0.005434 38.281 < 2e-16 ***
distance_to_nearest_green_space_within_a_6_10_minute_walk -0.028242 0.005108 -5.529 3.24e-08 ***
age_65_years 0.048057 0.004851 9.907 < 2e-16 ***
community_belonging_very_strongly 0.409425 0.005621 72.844 < 2e-16 ***
community_belonging_fairly_strongly 0.192933 0.005380 35.863 < 2e-16 ***
satisfaction_with_nearest_green_space_fairly_satisfied 0.053538 0.005229 10.239 < 2e-16 ***
distance_to_nearest_green_space_within_an_11_20_minute_walk -0.033114 0.007028 -4.712 2.46e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4589 on 48534 degrees of freedom
(9988 observations deleted due to missingness)
Multiple R-squared: 0.1499, Adjusted R-squared: 0.1497
F-statistic: 1222 on 7 and 48534 DF, p-value: < 2.2e-16
aggregate_responses_dummies
# Four-predictor version: satisfaction, the 6-10 minute walk indicator and the
# two stronger community-belonging indicators.
model2c <- lm(
  neighbourhood_rating_very_good ~
    satisfaction_with_nearest_green_space_very_satisfied +
    distance_to_nearest_green_space_within_a_6_10_minute_walk +
    community_belonging_very_strongly +
    community_belonging_fairly_strongly,
  data = aggregate_responses_dummies
)
model2c %>%
  summary()
Call:
lm(formula = neighbourhood_rating_very_good ~ satisfaction_with_nearest_green_space_very_satisfied +
distance_to_nearest_green_space_within_a_6_10_minute_walk +
community_belonging_very_strongly + community_belonging_fairly_strongly,
data = aggregate_responses_dummies)
Residuals:
Min 1Q Median 3Q Max
-0.8612 -0.4419 0.1388 0.3590 0.7581
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.264004 0.004616 57.20 < 2e-16 ***
satisfaction_with_nearest_green_space_very_satisfied 0.177025 0.004455 39.74 < 2e-16 ***
distance_to_nearest_green_space_within_a_6_10_minute_walk -0.022084 0.005030 -4.39 1.13e-05 ***
community_belonging_very_strongly 0.420159 0.005568 75.46 < 2e-16 ***
community_belonging_fairly_strongly 0.199930 0.005370 37.23 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4599 on 48537 degrees of freedom
(9988 observations deleted due to missingness)
Multiple R-squared: 0.146, Adjusted R-squared: 0.146
F-statistic: 2075 on 4 and 48537 DF, p-value: < 2.2e-16
# Same four predictors as model2c plus the standard-grade education indicator.
model3c <- lm(
  neighbourhood_rating_very_good ~
    satisfaction_with_nearest_green_space_very_satisfied +
    distance_to_nearest_green_space_within_a_6_10_minute_walk +
    community_belonging_very_strongly +
    community_belonging_fairly_strongly +
    highest_education_level_standard_grade_or_equiv_svq_level_1_or_2,
  data = aggregate_responses_dummies
)
model3c %>%
  summary()
Call:
lm(formula = neighbourhood_rating_very_good ~ satisfaction_with_nearest_green_space_very_satisfied +
distance_to_nearest_green_space_within_a_6_10_minute_walk +
community_belonging_very_strongly + community_belonging_fairly_strongly +
highest_education_level_standard_grade_or_equiv_svq_level_1_or_2,
data = aggregate_responses_dummies)
Residuals:
Min 1Q Median 3Q Max
-0.8995 -0.4642 0.1004 0.3585 0.8164
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.290179 0.005240 55.379 < 2e-16 ***
satisfaction_with_nearest_green_space_very_satisfied 0.174072 0.004881 35.661 < 2e-16 ***
distance_to_nearest_green_space_within_a_6_10_minute_walk -0.022599 0.005572 -4.056 5.01e-05 ***
community_belonging_very_strongly 0.435300 0.006159 70.672 < 2e-16 ***
community_belonging_fairly_strongly 0.207914 0.005854 35.520 < 2e-16 ***
highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 -0.084009 0.005336 -15.744 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4554 on 39265 degrees of freedom
(19259 observations deleted due to missingness)
Multiple R-squared: 0.1589, Adjusted R-squared: 0.1588
F-statistic: 1484 on 5 and 39265 DF, p-value: < 2.2e-16
aggregate_responses_dummies
# Linear fit on belonging, satisfaction and education indicators, with no
# distance term.
model4c <- lm(
  neighbourhood_rating_very_good ~
    community_belonging_very_strongly +
    satisfaction_with_nearest_green_space_very_satisfied +
    highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 +
    community_belonging_fairly_strongly,
  data = aggregate_responses_dummies
)
model4c %>%
  summary()
Call:
lm(formula = neighbourhood_rating_very_good ~ community_belonging_very_strongly +
satisfaction_with_nearest_green_space_very_satisfied + highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 +
community_belonging_fairly_strongly, data = aggregate_responses_dummies)
Residuals:
Min 1Q Median 3Q Max
-0.8956 -0.4599 0.1044 0.3639 0.7996
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.284774 0.005069 56.18 <2e-16 ***
community_belonging_very_strongly 0.435719 0.006160 70.74 <2e-16 ***
satisfaction_with_nearest_green_space_very_satisfied 0.175111 0.004876 35.92 <2e-16 ***
highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 -0.084380 0.005336 -15.81 <2e-16 ***
community_belonging_fairly_strongly 0.208052 0.005855 35.54 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4555 on 39266 degrees of freedom
(19259 observations deleted due to missingness)
Multiple R-squared: 0.1586, Adjusted R-squared: 0.1585
F-statistic: 1850 on 4 and 39266 DF, p-value: < 2.2e-16
# Logistic (binomial, logit link) version of the model4c predictor set.
model5c <- glm(
  neighbourhood_rating_very_good ~
    community_belonging_very_strongly +
    satisfaction_with_nearest_green_space_very_satisfied +
    highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 +
    community_belonging_fairly_strongly,
  family = binomial(link = "logit"),
  data = aggregate_responses_dummies
)
model5c %>%
  summary()
Call:
glm(formula = neighbourhood_rating_very_good ~ community_belonging_very_strongly +
satisfaction_with_nearest_green_space_very_satisfied + highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 +
community_belonging_fairly_strongly, family = binomial(link = "logit"),
data = aggregate_responses_dummies)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.0083 -1.1302 0.5345 0.9245 1.7810
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.95419 0.02467 -38.69 <2e-16 ***
community_belonging_very_strongly 1.98581 0.03100 64.06 <2e-16 ***
satisfaction_with_nearest_green_space_very_satisfied 0.84208 0.02401 35.07 <2e-16 ***
highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 -0.40262 0.02565 -15.70 <2e-16 ***
community_belonging_fairly_strongly 0.90574 0.02761 32.81 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 53901 on 39270 degrees of freedom
Residual deviance: 47255 on 39266 degrees of freedom
(19259 observations deleted due to missingness)
AIC: 47265
Number of Fisher Scoring iterations: 4
# pROC provides roc(); build the ROC object from the prediction table, using
# the observed 0/1 rating indicator as response and `pred` as the score.
library(pROC)
roc_obj_4_pred <- neighbourhood_rating_4_pred %>%
  roc(
    response = neighbourhood_rating_very_good,
    predictor = pred
  )
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# ROC curve drawn with legacy axes, so the x-axis is 1 - specificity. That
# quantity is the False Positive Rate (the previous label called it the True
# Negative Rate, which is specificity itself).
roc_curve <- ggroc(data = roc_obj_4_pred, legacy.axes = TRUE) +
  coord_fixed() +
  ylab("sensitivity (True Positive Rate)") +
  xlab("1-specificity (False Positive Rate)")
roc_curve
#The closer the curve is to the top-left corner, the more effective the classifier. The ‘perfect classifier’ would have a TPR of 1.0 and an FPR of 0.0, which corresponds to the top-left point on the chart. It’s possible for a classifier to have a curve ‘below’ the diagonal, in which case it performs more poorly than random guessing. Quite an achievement! The upper-right corner of the curve corresponds to a low threshold probability (i.e. ‘classify everything as positive’). The lower-left corner therefore corresponds to a high threshold probability. At the point where the ROC curve crosses the line running from the top-left to the bottom-right corner, sensitivity = specificity.
aggregate_responses_dummies
# Logistic fit like model5c, but with the "not very strongly" belonging
# indicator in place of "very strongly".
model6c <- glm(
  neighbourhood_rating_very_good ~
    community_belonging_not_very_strongly +
    satisfaction_with_nearest_green_space_very_satisfied +
    highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 +
    community_belonging_fairly_strongly,
  family = binomial(link = "logit"),
  data = aggregate_responses_dummies
)
model6c %>%
  summary()
Call:
glm(formula = neighbourhood_rating_very_good ~ community_belonging_not_very_strongly +
satisfaction_with_nearest_green_space_very_satisfied + highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 +
community_belonging_fairly_strongly, family = binomial(link = "logit"),
data = aggregate_responses_dummies)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.8183 -1.1527 0.6520 0.9463 1.7383
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.57131 0.02036 28.06 <2e-16 ***
community_belonging_not_very_strongly -1.43923 0.03157 -45.58 <2e-16 ***
satisfaction_with_nearest_green_space_very_satisfied 0.86928 0.02327 37.36 <2e-16 ***
highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 -0.39360 0.02478 -15.88 <2e-16 ***
community_belonging_fairly_strongly -0.62972 0.02369 -26.59 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 53901 on 39270 degrees of freedom
Residual deviance: 49625 on 39266 degrees of freedom
(19259 observations deleted due to missingness)
AIC: 49635
Number of Fisher Scoring iterations: 4
# Analysis-of-deviance comparison of model5c and model6c.
# NOTE(review): the two models swap one predictor rather than adding one, so
# they are not nested and have the same residual degrees of freedom; the
# resulting table has Df = 0 and reports no p-value. The AIC values in the two
# model summaries are the usable comparison here.
anova(model5c, model6c, test = "Chisq")
Analysis of Deviance Table
Model 1: neighbourhood_rating_very_good ~ community_belonging_very_strongly +
satisfaction_with_nearest_green_space_very_satisfied + highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 +
community_belonging_fairly_strongly
Model 2: neighbourhood_rating_very_good ~ community_belonging_not_very_strongly +
satisfaction_with_nearest_green_space_very_satisfied + highest_education_level_standard_grade_or_equiv_svq_level_1_or_2 +
community_belonging_fairly_strongly
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1 39266 47255
2 39266 49625 0 -2369.8
# Area under the ROC curve for the stored ROC object.
roc_obj_4_pred %>%
  auc()
Area under the curve: 0.7282